Estimating a continuous target with Regression and Regression Tree¶

In [1]:
# === CELL 0: imports & plotting style (drop-in) ===
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression

import statsmodels.formula.api as smf
import statsmodels.api as sm

pd.set_option('display.float_format', lambda x: f'{x:,.3f}')
sns.set(context="notebook", style="whitegrid")
In [2]:
df=pd.read_csv('/Users/connorross/Downloads/Movie_regression.csv', index_col=0)

1. EDA ---------------------------------------------------------------------------¶

In [3]:
df.shape
Out[3]:
(506, 17)
In [4]:
df.head()  # if want to drop a column then df.drop(["columnname"], axis=1, inplace=True)
Out[4]:
Production expense Multiplex coverage Budget Movie_length Lead_ Actor_Rating Lead_Actress_rating Director_rating Producer_rating Critic_rating Trailer_views 3D_available Time_taken Twitter_hastags Genre Avg_age_actors Num_multiplex Collection
Marketing expense
20.126 59.620 0.462 36,524.125 138.700 7.825 8.095 7.910 7.995 7.940 527367 YES 109.600 223.840 Thriller 23 494 48000
20.546 69.140 0.531 35,668.655 152.400 7.505 7.650 7.440 7.470 7.440 494055 NO 146.640 243.456 Drama 42 462 43200
20.546 69.140 0.531 39,912.675 134.600 7.485 7.570 7.495 7.515 7.440 547051 NO 147.880 2,022.400 Comedy 38 458 69400
20.647 59.360 0.542 38,873.890 119.300 6.895 7.035 6.920 7.020 8.260 516279 YES 185.360 225.344 Drama 45 472 66800
21.381 59.360 0.542 39,701.585 127.700 6.920 7.070 6.815 7.070 8.260 531448 NO 176.480 225.792 Drama 55 395 72400
In [5]:
df.isnull().sum() # if wish to drop nulls then df.dropna(inplace=True)
Out[5]:
Production expense      0
Multiplex coverage      0
Budget                  0
Movie_length            0
Lead_ Actor_Rating      0
Lead_Actress_rating     0
Director_rating         0
Producer_rating         0
Critic_rating           0
Trailer_views           0
3D_available            0
Time_taken             12
Twitter_hastags         0
Genre                   0
Avg_age_actors          0
Num_multiplex           0
Collection              0
dtype: int64
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 506 entries, 20.1264 to 20.9482
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Production expense   506 non-null    float64
 1   Multiplex coverage   506 non-null    float64
 2   Budget               506 non-null    float64
 3   Movie_length         506 non-null    float64
 4   Lead_ Actor_Rating   506 non-null    float64
 5   Lead_Actress_rating  506 non-null    float64
 6   Director_rating      506 non-null    float64
 7   Producer_rating      506 non-null    float64
 8   Critic_rating        506 non-null    float64
 9   Trailer_views        506 non-null    int64  
 10  3D_available         506 non-null    object 
 11  Time_taken           494 non-null    float64
 12  Twitter_hastags      506 non-null    float64
 13  Genre                506 non-null    object 
 14  Avg_age_actors       506 non-null    int64  
 15  Num_multiplex        506 non-null    int64  
 16  Collection           506 non-null    int64  
dtypes: float64(11), int64(4), object(2)
memory usage: 71.2+ KB
In [10]:
# Check missing values
print("Missing values before fill:\n", df['Time_taken'].isna().sum())

# Fill missing numeric with median (keeps distribution shape)
df['Time_taken'] = df['Time_taken'].fillna(df['Time_taken'].median())

print("Missing values after fill:\n", df['Time_taken'].isna().sum())
Missing values before fill:
 12
Missing values after fill:
 0
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 506 entries, 20.1264 to 20.9482
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Production expense   506 non-null    float64
 1   Multiplex coverage   506 non-null    float64
 2   Budget               506 non-null    float64
 3   Movie_length         506 non-null    float64
 4   Lead_ Actor_Rating   506 non-null    float64
 5   Lead_Actress_rating  506 non-null    float64
 6   Director_rating      506 non-null    float64
 7   Producer_rating      506 non-null    float64
 8   Critic_rating        506 non-null    float64
 9   Trailer_views        506 non-null    int64  
 10  3D_available         506 non-null    object 
 11  Time_taken           506 non-null    float64
 12  Twitter_hastags      506 non-null    float64
 13  Genre                506 non-null    object 
 14  Avg_age_actors       506 non-null    int64  
 15  Num_multiplex        506 non-null    int64  
 16  Collection           506 non-null    int64  
dtypes: float64(11), int64(4), object(2)
memory usage: 71.2+ KB
In [ ]:
df.describe()
Out[ ]:
Production expense Multiplex coverage Budget Movie_length Lead_ Actor_Rating Lead_Actress_rating Director_rating Producer_rating Critic_rating Trailer_views Time_taken Twitter_hastags Avg_age_actors Num_multiplex Collection
count 506.000 506.000 506.000 506.000 506.000 506.000 506.000 506.000 506.000 506.000 494.000 506.000 506.000 506.000 506.000
mean 77.274 0.445 34,911.144 142.075 8.014 8.186 8.020 8.191 7.811 449,860.715 157.391 260.832 39.182 545.043 45,057.708
std 13.721 0.116 3,903.038 28.149 1.054 1.054 1.060 1.050 0.660 68,917.763 31.295 104.779 12.514 106.333 18,364.352
min 55.920 0.129 19,781.355 76.400 3.840 4.035 3.840 4.030 6.600 212,912.000 0.000 201.152 3.000 333.000 10,000.000
25% 65.380 0.376 32,693.952 118.525 7.316 7.504 7.296 7.508 7.200 409,128.000 132.300 223.796 28.000 465.000 34,050.000
50% 74.380 0.462 34,488.217 151.000 8.308 8.495 8.312 8.465 7.960 462,460.000 160.000 254.400 39.000 535.500 42,400.000
75% 91.200 0.551 36,793.542 167.575 8.865 9.030 8.884 9.030 8.260 500,247.500 181.890 283.416 50.000 614.750 50,000.000
max 110.480 0.615 48,772.900 173.500 9.435 9.540 9.425 9.635 9.400 567,784.000 217.520 2,022.400 60.000 868.000 100,000.000
In [9]:
# === CELL 3: featured statistics (summary table) ===
# Numeric summary
desc_num = df.select_dtypes(include=[np.number]).describe().T
desc_num['skew'] = df.select_dtypes(include=[np.number]).skew()
desc_num['kurtosis'] = df.select_dtypes(include=[np.number]).kurtosis()
desc_num
Out[9]:
count mean std min 25% 50% 75% max skew kurtosis
Production expense 506.000 77.274 13.721 55.920 65.380 74.380 91.200 110.480 0.295 -1.234
Multiplex coverage 506.000 0.445 0.116 0.129 0.376 0.462 0.551 0.615 -0.729 -0.065
Budget 506.000 34,911.144 3,903.038 19,781.355 32,693.952 34,488.217 36,793.542 48,772.900 0.404 1.892
Movie_length 506.000 142.075 28.149 76.400 118.525 151.000 167.575 173.500 -0.599 -0.968
Lead_ Actor_Rating 506.000 8.014 1.054 3.840 7.316 8.308 8.865 9.435 -1.011 0.498
Lead_Actress_rating 506.000 8.186 1.054 4.035 7.504 8.495 9.030 9.540 -1.007 0.473
Director_rating 506.000 8.020 1.060 3.840 7.296 8.312 8.884 9.425 -1.004 0.458
Producer_rating 506.000 8.191 1.050 4.030 7.508 8.465 9.030 9.635 -1.005 0.503
Critic_rating 506.000 7.811 0.660 6.600 7.200 7.960 8.260 9.400 0.176 -0.752
Trailer_views 506.000 449,860.715 68,917.763 212,912.000 409,128.000 462,460.000 500,247.500 567,784.000 -0.844 0.489
Time_taken 494.000 157.391 31.295 0.000 132.300 160.000 181.890 217.520 -0.473 1.114
Twitter_hastags 506.000 260.832 104.779 201.152 223.796 254.400 283.416 2,022.400 13.791 214.232
Avg_age_actors 506.000 39.182 12.514 3.000 28.000 39.000 50.000 60.000 0.013 -1.200
Num_multiplex 506.000 545.043 106.333 333.000 465.000 535.500 614.750 868.000 0.534 -0.121
Collection 506.000 45,057.708 18,364.352 10,000.000 34,050.000 42,400.000 50,000.000 100,000.000 1.111 1.517
In [12]:
TARGET = 'Collection'

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(df[TARGET].dropna(), kde=True, ax=axes[0])
axes[0].set_title(f'{TARGET} Distribution')

# log1p (safe if zeros present)
sns.histplot(np.log1p(df[TARGET].clip(lower=0)).dropna(), kde=True, ax=axes[1])
axes[1].set_title(f'log1p({TARGET}) Distribution')
plt.tight_layout()

skew_raw = df[TARGET].skew()
skew_log = np.log1p(df[TARGET].clip(lower=0)).skew()
print(f"Skew (raw) = {skew_raw:0.3f} | Skew (log1p) = {skew_log:0.3f}")
Skew (raw) = 1.111 | Skew (log1p) = -0.335
No description has been provided for this image
In [14]:
# === CELL 5: correlation matrix heatmap (numeric features only) ===
num_df = df.select_dtypes(include=[np.number])
plt.figure(figsize=(12, 10))
corr = num_df.corr(numeric_only=True)
sns.heatmap(corr, annot=False, cmap='coolwarm', center=0, square=False)
plt.title('Correlation Matrix (Numeric Features)')
plt.show()

# Optional: print top correlations with target
print("Top |corr| with target:")
print(corr[TARGET].drop(TARGET).abs().sort_values(ascending=False).head(12))
No description has been provided for this image
Top |corr| with target:
Trailer_views         0.720
Budget                0.696
Production expense    0.485
Multiplex coverage    0.429
Num_multiplex         0.392
Movie_length          0.378
Critic_rating         0.341
Lead_ Actor_Rating    0.251
Lead_Actress_rating   0.249
Producer_rating       0.248
Director_rating       0.247
Time_taken            0.109
Name: Collection, dtype: float64
In [16]:
sns.pairplot(data=df) # makes a scatterplot matrix
Out[16]:
<seaborn.axisgrid.PairGrid at 0x31902d950>
No description has been provided for this image

2. Linear Regression with all data using statmodels ---------------------------------¶

You can analyze each input feature individually (nice exploration but usually skip these steps and do the full model)¶

In [11]:
# https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html
In [17]:
df_model = df.copy()

# Make sure categorical columns are of type 'category' or string
for col in ['3D_available', 'Genre']:
    if col in df_model.columns:
        df_model[col] = df_model[col].astype('category')

# One-hot encode (drop_first to avoid dummy trap in sklearn)
cat_cols = [c for c in ['3D_available', 'Genre'] if c in df_model.columns]
df_dum = pd.get_dummies(df_model, columns=cat_cols, drop_first=True, dtype=int)

print("Shape after dummies:", df_dum.shape)
df_dum.head()
Shape after dummies: (506, 19)
Out[17]:
Production expense Multiplex coverage Budget Movie_length Lead_ Actor_Rating Lead_Actress_rating Director_rating Producer_rating Critic_rating Trailer_views Time_taken Twitter_hastags Avg_age_actors Num_multiplex Collection 3D_available_YES Genre_Comedy Genre_Drama Genre_Thriller
Marketing expense
20.126 59.620 0.462 36,524.125 138.700 7.825 8.095 7.910 7.995 7.940 527367 109.600 223.840 23 494 48000 1 0 0 1
20.546 69.140 0.531 35,668.655 152.400 7.505 7.650 7.440 7.470 7.440 494055 146.640 243.456 42 462 43200 0 0 1 0
20.546 69.140 0.531 39,912.675 134.600 7.485 7.570 7.495 7.515 7.440 547051 147.880 2,022.400 38 458 69400 0 1 0 0
20.647 59.360 0.542 38,873.890 119.300 6.895 7.035 6.920 7.020 8.260 516279 185.360 225.344 45 472 66800 1 0 1 0
21.381 59.360 0.542 39,701.585 127.700 6.920 7.070 6.815 7.070 8.260 531448 176.480 225.792 55 395 72400 0 0 1 0
In [18]:
# Features = all columns except target
X = df_dum.drop(columns=[TARGET])
y = df_dum[TARGET].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape
Out[18]:
((404, 18), (102, 18))
In [20]:
# Fit linear regression model
linr = LinearRegression()
linr.fit(X_train, y_train)

# Predictions
pred_tr = linr.predict(X_train)
pred_te = linr.predict(X_test)

# Metrics function (handles old sklearn versions)
def reg_metrics(y_true, y_pred, label=''):
    r2 = r2_score(y_true, y_pred)
    try:
        # Works for sklearn >=0.22
        rmse = mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        # Fallback for older sklearn
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return pd.Series({'R2': r2, 'RMSE': rmse, 'MAE': mae}, name=label)

# Evaluate on train and test sets
metrics_lr = pd.concat([
    reg_metrics(y_train, pred_tr, 'Train'),
    reg_metrics(y_test, pred_te, 'Test')
], axis=1)

print("Linear Regression Performance Metrics:")
display(metrics_lr)
Linear Regression Performance Metrics:
Train Test
R2 0.698 0.609
RMSE 10,218.431 10,740.141
MAE 7,343.131 7,720.467
In [23]:
# === CELL 11 (Updated): statsmodels OLS with safe quoting for all column names ===
import statsmodels.formula.api as smf

# Ensure categoricals are typed as category for readability (not strictly required with C(), but nice)
for _col in ['3D_available', 'Genre']:
    if _col in df.columns:
        df[_col] = df[_col].astype('category')

TARGET = 'Collection'
all_cols = [c for c in df.columns if c != TARGET]

def q(col_name: str) -> str:
    """Quote a column name for Patsy when it has spaces/starts with digits."""
    return f'Q("{col_name}")'

categorical_cols = [c for c in ['3D_available', 'Genre'] if c in df.columns]

# Build RHS with quoting; wrap categoricals in C()
rhs_terms = []
for c in all_cols:
    if c in categorical_cols:
        rhs_terms.append(f'C({q(c)})')
    else:
        rhs_terms.append(q(c))

formula = f'{q(TARGET)} ~ ' + ' + '.join(rhs_terms)
print("OLS formula:\n", formula)

ols_model = smf.ols(formula=formula, data=df).fit()
ols_summary = ols_model.summary()
ols_summary
OLS formula:
 Q("Collection") ~ Q("Production expense") + Q("Multiplex coverage") + Q("Budget") + Q("Movie_length") + Q("Lead_ Actor_Rating") + Q("Lead_Actress_rating") + Q("Director_rating") + Q("Producer_rating") + Q("Critic_rating") + Q("Trailer_views") + C(Q("3D_available")) + Q("Time_taken") + Q("Twitter_hastags") + C(Q("Genre")) + Q("Avg_age_actors") + Q("Num_multiplex")
Out[23]:
OLS Regression Results
Dep. Variable: Q("Collection") R-squared: 0.686
Model: OLS Adj. R-squared: 0.674
Method: Least Squares F-statistic: 59.05
Date: Wed, 15 Oct 2025 Prob (F-statistic): 6.67e-110
Time: 21:41:37 Log-Likelihood: -5392.6
No. Observations: 506 AIC: 1.082e+04
Df Residuals: 487 BIC: 1.090e+04
Df Model: 18
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept -1.485e+05 1.67e+04 -8.883 0.000 -1.81e+05 -1.16e+05
C(Q("3D_available"))[T.YES] 2350.3851 966.511 2.432 0.015 451.338 4249.432
C(Q("Genre"))[T.Comedy] 1560.6604 1524.385 1.024 0.306 -1434.523 4555.844
C(Q("Genre"))[T.Drama] 2412.2826 1663.731 1.450 0.148 -856.695 5681.260
C(Q("Genre"))[T.Thriller] 1929.0418 1494.396 1.291 0.197 -1007.218 4865.301
Q("Production expense") -135.7828 59.491 -2.282 0.023 -252.673 -18.893
Q("Multiplex coverage") 3.056e+04 1.19e+04 2.570 0.010 7200.431 5.39e+04
Q("Budget") 1.6695 0.158 10.571 0.000 1.359 1.980
Q("Movie_length") -25.4122 28.778 -0.883 0.378 -81.956 31.132
Q("Lead_ Actor_Rating") 4231.2170 7904.446 0.535 0.593 -1.13e+04 1.98e+04
Q("Lead_Actress_rating") -8099.4715 8440.037 -0.960 0.338 -2.47e+04 8483.911
Q("Director_rating") 5797.2750 8197.711 0.707 0.480 -1.03e+04 2.19e+04
Q("Producer_rating") 2800.0944 4407.892 0.635 0.526 -5860.740 1.15e+04
Q("Critic_rating") 4216.1049 753.432 5.596 0.000 2735.725 5696.484
Q("Trailer_views") 0.1135 0.011 10.575 0.000 0.092 0.135
Q("Time_taken") 33.8507 15.499 2.184 0.029 3.398 64.303
Q("Twitter_hastags") 3.3355 4.505 0.740 0.459 -5.516 12.187
Q("Avg_age_actors") 25.2103 38.125 0.661 0.509 -49.699 100.119
Q("Num_multiplex") 8.4517 11.008 0.768 0.443 -13.177 30.080
Omnibus: 155.065 Durbin-Watson: 0.950
Prob(Omnibus): 0.000 Jarque-Bera (JB): 591.355
Skew: 1.356 Prob(JB): 3.88e-129
Kurtosis: 7.549 Cond. No. 1.92e+07


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.92e+07. This might indicate that there are
strong multicollinearity or other numerical problems.

Create full model with all 3 predictors¶

In [19]:
y=df.sales #outcome or target
x=df[['TV','radio', 'newspaper']] #pedictor
x=sm.add_constant(x) #adds a constant term to the predictor
In [20]:
lrmodel = sm.OLS(y,x).fit()
print(lrmodel.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Wed, 15 Oct 2025   Prob (F-statistic):           1.58e-96
Time:                        20:07:38   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.000       2.324       3.554
TV             0.0458      0.001     32.809      0.000       0.043       0.049
radio          0.1885      0.009     21.893      0.000       0.172       0.206
newspaper     -0.0010      0.006     -0.177      0.860      -0.013       0.011
==============================================================================
Omnibus:                       60.414   Durbin-Watson:                   2.084
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              151.241
Skew:                          -1.327   Prob(JB):                     1.44e-33
Kurtosis:                       6.332   Cond. No.                         454.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [21]:
### drop newspaper with bad p-value of .86, create new model
y=df.sales #outcome or target
x=df[['TV','radio']] #pedictor
x=sm.add_constant(x) #adds a constant term to the predictor
lrmodel = sm.OLS(y,x).fit()
print(lrmodel.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     859.6
Date:                Wed, 15 Oct 2025   Prob (F-statistic):           4.83e-98
Time:                        20:07:49   Log-Likelihood:                -386.20
No. Observations:                 200   AIC:                             778.4
Df Residuals:                     197   BIC:                             788.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9211      0.294      9.919      0.000       2.340       3.502
TV             0.0458      0.001     32.909      0.000       0.043       0.048
radio          0.1880      0.008     23.382      0.000       0.172       0.204
==============================================================================
Omnibus:                       60.022   Durbin-Watson:                   2.081
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              148.679
Skew:                          -1.323   Prob(JB):                     5.19e-33
Kurtosis:                       6.292   Cond. No.                         425.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [22]:
# ### Model looks good: R2 is large at 89%, F is large (not 0), the p-values for TV and Radio are below .01. 
# ### Formula: is Sales =  2.9211 + TV x .0458 + Radio x .188  *** This formula is what your client is paying you for.
# ### Weaknesses: the data are a bit not normally distributed (but not terrible) - skew should be 0 but is -1.323 thus is negatively skewed (left tail long) Kurtosis should be 3 but is 6.2 which indicates a "heavy-tailed" distribution, which indicates some outliers
In [23]:
## Measure of fit performance (we will use RMSE)
from statsmodels.tools.eval_measures import rmse
ypredLR = lrmodel.predict(x)
rmse(y,ypredLR) #RMSE Root Mean Squared Error
Out[23]:
np.float64(1.6687030593661927)

3. Regression Tree using all data using sklearn DecisionTreeRegressor --------------------------¶

In [24]:
# documentation at https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html 
In [24]:
# Keep it simple per your constraints; depth can be capped for readability.
tree = DecisionTreeRegressor(random_state=42, max_depth=5, min_samples_leaf=5)
tree.fit(X_train, y_train)

pred_tr_tree = tree.predict(X_train)
pred_te_tree = tree.predict(X_test)

metrics_tree = pd.concat([
    reg_metrics(y_train, pred_tr_tree, 'Train'),
    reg_metrics(y_test, pred_te_tree, 'Test')
], axis=1)
metrics_tree
Out[24]:
Train Test
R2 0.865 0.805
RMSE 6,839.681 7,593.442
MAE 4,665.673 5,755.941
In [25]:
# === CELL 15: Tree Feature Importances ===
imp = pd.Series(tree.feature_importances_, index=X.columns)
imp_sorted = imp.sort_values(ascending=False)
imp_sorted.head(20)
Out[25]:
Budget                0.681
Trailer_views         0.199
Lead_Actress_rating   0.038
Director_rating       0.031
Critic_rating         0.017
Avg_age_actors        0.009
Twitter_hastags       0.007
Movie_length          0.007
Producer_rating       0.006
Production expense    0.005
Time_taken            0.000
Multiplex coverage    0.000
Lead_ Actor_Rating    0.000
Num_multiplex         0.000
3D_available_YES      0.000
Genre_Comedy          0.000
Genre_Drama           0.000
Genre_Thriller        0.000
dtype: float64

visualize in tree form¶

In [26]:
plt.figure(figsize=(18, 8))
plot_tree(tree, feature_names=X.columns, filled=True, rounded=True, fontsize=8)
plt.title('Decision Tree (depth-capped)')
plt.show()
No description has been provided for this image
In [32]:
# === TEXT TREE (fixed): avoid name collision and print rules ===

# If you previously did: from sklearn import tree
# either remove that import, or alias it:
import sklearn.tree as sktree  # safe alias if you still want the module

# Your fitted model earlier was named `tree`; let's rename it once to avoid collisions:
# If you *still* have the fitted model in memory as `tree` (estimator), do this:
try:
    # is it an estimator?
    from sklearn.tree import DecisionTreeRegressor
    if isinstance(tree, DecisionTreeRegressor):
        dt_model = tree
    else:
        raise TypeError("`tree` is not an estimator; re-fitting below.")
except Exception:
    # Refit quickly if needed (in case `tree` got overwritten by the module)
    from sklearn.tree import DecisionTreeRegressor
    dt_model = DecisionTreeRegressor(random_state=42, max_depth=5, min_samples_leaf=5)
    dt_model.fit(X_train, y_train)

# Now safely export text using the function, not the module
from sklearn.tree import export_text
text_rules = export_text(dt_model, feature_names=list(X.columns), decimals=2, show_weights=False)
print(text_rules)

# Optional shallow view for report
text_rules_shallow = export_text(dt_model, feature_names=list(X.columns), decimals=2, max_depth=3)
print("\n--- Shallow tree (max_depth=3) ---\n")
print(text_rules_shallow)

# Quick summary
print("\nTree depth:", dt_model.get_depth(), " | leaves:", dt_model.get_n_leaves())
|--- Budget <= 37982.31
|   |--- Trailer_views <= 440392.00
|   |   |--- Director_rating <= 8.79
|   |   |   |--- Movie_length <= 153.55
|   |   |   |   |--- Critic_rating <= 7.81
|   |   |   |   |   |--- value: [38822.22]
|   |   |   |   |--- Critic_rating >  7.81
|   |   |   |   |   |--- value: [44218.18]
|   |   |   |--- Movie_length >  153.55
|   |   |   |   |--- Producer_rating <= 8.55
|   |   |   |   |   |--- value: [31233.33]
|   |   |   |   |--- Producer_rating >  8.55
|   |   |   |   |   |--- value: [36230.00]
|   |   |--- Director_rating >  8.79
|   |   |   |--- Trailer_views <= 386160.50
|   |   |   |   |--- Critic_rating <= 8.02
|   |   |   |   |   |--- value: [21186.05]
|   |   |   |   |--- Critic_rating >  8.02
|   |   |   |   |   |--- value: [30088.89]
|   |   |   |--- Trailer_views >  386160.50
|   |   |   |   |--- Lead_Actress_rating <= 9.22
|   |   |   |   |   |--- value: [29900.00]
|   |   |   |   |--- Lead_Actress_rating >  9.22
|   |   |   |   |   |--- value: [37771.43]
|   |--- Trailer_views >  440392.00
|   |   |--- Lead_Actress_rating <= 9.29
|   |   |   |--- Budget <= 36346.37
|   |   |   |   |--- Budget <= 33677.19
|   |   |   |   |   |--- value: [40854.24]
|   |   |   |   |--- Budget >  33677.19
|   |   |   |   |   |--- value: [45274.42]
|   |   |   |--- Budget >  36346.37
|   |   |   |   |--- Production expense <= 62.22
|   |   |   |   |   |--- value: [60516.67]
|   |   |   |   |--- Production expense >  62.22
|   |   |   |   |   |--- value: [51756.52]
|   |   |--- Lead_Actress_rating >  9.29
|   |   |   |--- value: [75240.00]
|--- Budget >  37982.31
|   |--- Budget <= 41312.54
|   |   |--- Trailer_views <= 474030.50
|   |   |   |--- value: [49114.29]
|   |   |--- Trailer_views >  474030.50
|   |   |   |--- Avg_age_actors <= 36.50
|   |   |   |   |--- Budget <= 39348.84
|   |   |   |   |   |--- value: [57757.14]
|   |   |   |   |--- Budget >  39348.84
|   |   |   |   |   |--- value: [65928.57]
|   |   |   |--- Avg_age_actors >  36.50
|   |   |   |   |--- Producer_rating <= 7.05
|   |   |   |   |   |--- value: [66566.67]
|   |   |   |   |--- Producer_rating >  7.05
|   |   |   |   |   |--- value: [76022.22]
|   |--- Budget >  41312.54
|   |   |--- Critic_rating <= 7.48
|   |   |   |--- value: [74800.00]
|   |   |--- Critic_rating >  7.48
|   |   |   |--- Twitter_hastags <= 227.31
|   |   |   |   |--- value: [81200.00]
|   |   |   |--- Twitter_hastags >  227.31
|   |   |   |   |--- Producer_rating <= 8.65
|   |   |   |   |   |--- value: [93025.00]
|   |   |   |   |--- Producer_rating >  8.65
|   |   |   |   |   |--- value: [99700.00]


--- Shallow tree (max_depth=3) ---

|--- Budget <= 37982.31
|   |--- Trailer_views <= 440392.00
|   |   |--- Director_rating <= 8.79
|   |   |   |--- Movie_length <= 153.55
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- Movie_length >  153.55
|   |   |   |   |--- truncated branch of depth 2
|   |   |--- Director_rating >  8.79
|   |   |   |--- Trailer_views <= 386160.50
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- Trailer_views >  386160.50
|   |   |   |   |--- truncated branch of depth 2
|   |--- Trailer_views >  440392.00
|   |   |--- Lead_Actress_rating <= 9.29
|   |   |   |--- Budget <= 36346.37
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- Budget >  36346.37
|   |   |   |   |--- truncated branch of depth 2
|   |   |--- Lead_Actress_rating >  9.29
|   |   |   |--- value: [75240.00]
|--- Budget >  37982.31
|   |--- Budget <= 41312.54
|   |   |--- Trailer_views <= 474030.50
|   |   |   |--- value: [49114.29]
|   |   |--- Trailer_views >  474030.50
|   |   |   |--- Avg_age_actors <= 36.50
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- Avg_age_actors >  36.50
|   |   |   |   |--- truncated branch of depth 2
|   |--- Budget >  41312.54
|   |   |--- Critic_rating <= 7.48
|   |   |   |--- value: [74800.00]
|   |   |--- Critic_rating >  7.48
|   |   |   |--- Twitter_hastags <= 227.31
|   |   |   |   |--- value: [81200.00]
|   |   |   |--- Twitter_hastags >  227.31
|   |   |   |   |--- truncated branch of depth 2


Tree depth: 5  | leaves: 22
In [29]:
# === CELL 17 (Updated): compare models (sklearn LR vs Tree, compat with older sklearn) ===
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def rmse_compat(y_true, y_pred):
    """RMSE that works on old/new scikit-learn."""
    try:
        return mean_squared_error(y_true, y_pred, squared=False)  # sklearn >= 0.22
    except TypeError:
        return np.sqrt(mean_squared_error(y_true, y_pred))        # fallback

summary = pd.DataFrame({
    'LR_Test_R2':    [r2_score(y_test, pred_te)],
    'LR_Test_RMSE':  [rmse_compat(y_test, pred_te)],
    'LR_Test_MAE':   [mean_absolute_error(y_test, pred_te)],

    'Tree_Test_R2':   [r2_score(y_test, pred_te_tree)],
    'Tree_Test_RMSE': [rmse_compat(y_test, pred_te_tree)],
    'Tree_Test_MAE':  [mean_absolute_error(y_test, pred_te_tree)],
})

# If you ran the optional log1p model, add its back-transformed metrics (handle Series or DataFrame cases)
try:
    if 'metrics_lr_log' in globals():
        if isinstance(metrics_lr_log, pd.Series):
            summary['LR_log1p_Test_R2(back)']    = [metrics_lr_log['R2']]
            summary['LR_log1p_Test_RMSE(back)']  = [metrics_lr_log['RMSE']]
            summary['LR_log1p_Test_MAE(back)']   = [metrics_lr_log['MAE']]
        elif isinstance(metrics_lr_log, pd.DataFrame):
            s = metrics_lr_log['Test (Back-Transformed)']
            summary['LR_log1p_Test_R2(back)']    = [s['R2']]
            summary['LR_log1p_Test_RMSE(back)']  = [s['RMSE']]
            summary['LR_log1p_Test_MAE(back)']   = [s['MAE']]
except Exception:
    pass

print("Model comparison (lower RMSE/MAE is better, higher R² is better):")
display(summary.T)
Model comparison (lower RMSE/MAE is better, higher R² is better):
0
LR_Test_R2 0.609
LR_Test_RMSE 10,740.141
LR_Test_MAE 7,720.467
Tree_Test_R2 0.805
Tree_Test_RMSE 7,593.442
Tree_Test_MAE 5,755.941

Measure of fit performance (we will use RMSE)¶

In [33]:
# === CELL 18 (Updated): predict for a new (hypothetical) movie ===

# 1) Inspect the training categories so we can match them
cat_cols = [c for c in ['3D_available', 'Genre'] if c in df.columns]
cat_levels = {c: list(df[c].astype('category').cat.categories) for c in cat_cols}
print("Training categories:", cat_levels)

# 2) Define a new movie (ensure values exist in the 'Training categories' above)
new_movie = {
    'Marketing expense': 2_000_000,
    'Production expense': 5_500_000,
    'Multiplex coverage': 180,
    'Budget': 7_500_000,
    'Movie_length': 115,
    'Lead_ Actor_Rating': 7.5,     # note the exact column name (with space after underscore)
    'Lead_Actress_rating': 7.2,
    'Director_rating': 8.0,
    'Producer_rating': 7.0,
    'Critic_rating': 6.8,
    'Trailer_views': 1_200_000,
    '3D_available': 'YES',         # must be one of cat_levels['3D_available']
    'Time_taken': 200,
    'Twitter_hastags': 2_500,      # exact spelling matches your CSV
    'Genre': 'Action',             # must be one of cat_levels['Genre']
    'Avg_age_actors': 34,
    'Num_multiplex': 190
}
new_df = pd.DataFrame([new_movie])

# 3) Force the same categorical levels as training (prevents dummy misalignment)
for c in cat_cols:
    if c in new_df.columns:
        new_df[c] = pd.Categorical(new_df[c], categories=cat_levels[c])

# 4) One-hot encode using the same approach as training
new_dum = pd.get_dummies(new_df, columns=cat_cols, drop_first=True, dtype=int)

# 5) Align columns with the training design matrix X (fills missing dummies with 0)
new_dum_aligned = new_dum.reindex(columns=X.columns, fill_value=0)

# 6) Predict with both models
lr_pred_val = linr.predict(new_dum_aligned)[0]

# If you kept the tree as `tree`, use that; if you renamed per earlier fix, use `dt_model`
try:
    tree_pred_val = dt_model.predict(new_dum_aligned)[0]
except NameError:
    tree_pred_val = tree.predict(new_dum_aligned)[0]

print(f"Predicted Collection (Linear Regression): {lr_pred_val:,.0f}")
print(f"Predicted Collection (Decision Tree):     {tree_pred_val:,.0f}")
Training categories: {'3D_available': ['NO', 'YES'], 'Genre': ['Action', 'Comedy', 'Drama', 'Thriller']}
Predicted Collection (Linear Regression): -610,565,264
Predicted Collection (Decision Tree):     74,800

Optional: more on tree visualization https://mljar.com/blog/visualize-decision-tree/¶

Very Fancy Tree¶

In [ ]:
# For best results - install dtreeviz using Anaconda Navigator (which also installs the dependent packages) instead of pip below
In [ ]:
# pip install dtreeviz
In [ ]:
# pip install python-graphviz
In [ ]:
import dtreeviz
In [ ]:
# https://github.com/parrt/dtreeviz/blob/master/notebooks/dtreeviz_sklearn_visualisations.ipynb
viz = dtreeviz.model(regtreemodel2, x, y,target_name="sales", feature_names = ["TV","radio", "newspaper"])
In [ ]:
viz.view()
In [ ]:
viz.view(orientation="LR")
In [ ]:
viz.view(fancy=False)